Tidy bio data

Map grades to school level to detirmine elementary, middle and high school

We assumed: * k-5 is elementary school * 6-8 is middle school * 9-12 is high school Calculations were made with this in mind. We are aware that some schools are only k-2, so students will be zoned differently for grade 3, but we are interested in the zone of kindergarten/lowest grade a students entered the system.

grades <- c(0, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12)
schools <-c("em", "em", "em" , "em" , "em", "em", "mid" , "mid", "mid", "hs", "hs", "hs",  "hs")
map_grade <- data.frame(grades, schools)

‘regrade’ the grade_level so that all grades have a numeric value.

K is 0, all other grades before K are dropped because students are not zoned for school until Kindergarten.

all_bios <- bios

#bios_noNA <- all_bios%>% 
 # filter(!is.na(student_id_scram)) #only one student id is na

bios_dbn <- all_bios %>% 
  select(student_id_scram, dbn, year, grade_level)

bios_dbn <- bios_dbn %>% mutate(grade_level = ifelse(grade_level == "0K", "00", grade_level))
bios_dbn <- bios_dbn %>% mutate(grade_level = as.integer(grade_level)) %>% filter(!is.na(grade_level))
## Warning in evalq(as.integer(grade_level), <environment>): NAs introduced by
## coercion
#join data so you see students
bios_dbn <- bios_dbn %>% full_join(map_grade, by = c("grade_level" = "grades"))
################################ MOST IMPORTANT ###############################################
test<- bios_dbn %>% group_by(student_id_scram, schools, year, dbn) %>% summarise() 

#only keep entry to each school level for each student
bio_distinct_schools <- test %>% group_by(student_id_scram, schools) %>% filter(year == min(year))
####################################################################################################

#view
#bios_dbn %>% group_by(student_id_scram, dbn, schools) %>% summarise()%>% arrange(student_id_scram) %>% View()

smaller_data <- bios_dbn %>% group_by(student_id_scram, dbn, schools) %>% summarise()

unique_data <- smaller_data %>% group_by(student_id_scram, schools) %>% 
  distinct(student_id_scram, schools, .keep_all = TRUE) 

#it works!
# unique_data %>% 
#   group_by(student_id_scram) %>%
#   summarise(count =n())%>%
#   group_by(count) %>%
#   summarise(countx = n())

#spread this now to get distance from zone to school. IMPORTANT: We only took one mid, high, em for each student. 
#Some students changed schools, and we would not see this from this table.
#################################################################
bios_spread <- unique_data %>% spread(schools, dbn)
##################################################################

#display bio_distinct_schools

bio_distinct_schools
## # A tibble: 3,794,610 x 4
## # Groups:   student_id_scram, schools [3,794,610]
##    student_id_scram schools  year    dbn
##               <int>  <fctr> <dbl>  <chr>
##  1             2160      em  2005 20K185
##  2             2160      hs  2010 02M475
##  3             2160     mid  2007 20K187
##  4             2240      hs  2005 08X650
##  5             2259      em  2013 21K188
##  6             2320      em  2005 04M050
##  7             2320      hs  2010 02M655
##  8             2320     mid  2007 04M050
##  9             2339      em  2005 11X078
## 10             2339      hs  2009 08X405
## # ... with 3,794,600 more rows

Tidy zone data

#############################
#ZONE DATA DOES NOT INCLUDE ANYTHING FROM 2005 or 2006
#############################

#sanity checks are commented

# zoneData %>% select(student_id_scram, zoned_elm_dbn, zoned_mid_dbn, zoned_hs_dbn, year) %>%
#   group_by(student_id_scram) %>% summarise(count = n()) %>% group_by(count) %>% summarise(countx = n()) 

# zoneData %>% select(student_id_scram, zoned_elm_dbn, zoned_mid_dbn, zoned_hs_dbn, year) %>%
#   group_by(student_id_scram) %>% summarise(count = n()) %>% 
#   filter(count == 9) %>% head(1)

#zoneData %>% filter(student_id_scram == 12507)

distinct_zones <- zoneData %>% select(1:8) %>%  #used to be callued df_distinct
  group_by(student_id_scram, zoned_elm_dbn, zoned_mid_dbn, zoned_hs_dbn, year) %>% 
  distinct(student_id_scram, schools, .keep_all = TRUE) 

#df_distinct %>% group_by(year) %>% summarise()

distinct_zones
## # A tibble: 9,502,347 x 8
## # Groups:   student_id_scram, zoned_elm_dbn, zoned_mid_dbn, zoned_hs_dbn,
## #   year [9,502,347]
##    student_id_scram res_zip_cde zoned_elm_dbn zoned_mid_dbn zoned_hs_dbn
##               <int>       <chr>         <chr>         <chr>        <chr>
##  1        126202521       10009        01M015        01M973       71M985
##  2        892202621       10463        10X007        10X368       72X981
##  3        173202621       10009        01M063        01M972       71M985
##  4        428202887       10009        01M064        01M973       71M985
##  5        425202898       10002        01M188        01M973       71M985
##  6        452202349       10461        11X108        11X144       11X415
##  7        205202349       10009        01M064        01M972       71M985
##  8        775202349       10009        01M015        01M973       71M985
##  9        730302510       10002        01M974        01M973       71M985
## 10        862302510       10009        01M064        01M973       71M985
## # ... with 9,502,337 more rows, and 3 more variables: census_block <chr>,
## #   census_tract <chr>, year <dbl>

Join zone and bio data on DBN

We joined the data on DBNs from the zoned data and the DBN’s from the bio data on each elementary, middle and high school. It is important to note that zone data only begins from 2007. For students who began elementary, middle or high school before 2007, we do NOT have the zoned dbn. Therefore, naturally these students are not included on maps of attended school vs zoned school. For all other counts, these students are included.

Data Quirks: Students are zoned to an elementary, middle and high school every time he/she is zoned. We had to detirmine the year a student entered elementary school, for example, and then find that year for that student in the zoned data to detirmine the zone that student was mapped to the year he/she entered elementary school. Zones change quite frequently and this granular data was important for the numbers later on.

# reformat df to wide format
bio_distinct_schools <- bio_distinct_schools %>% spread(schools, dbn)

#join the data frames. All records that do not have a match are dropped. This ensures that the map will only include data that exists
join_zone_bio <- inner_join(bio_distinct_schools, distinct_zones, by = c("year", "student_id_scram"))

join_zone_bio
## # A tibble: 2,247,484 x 11
## # Groups:   student_id_scram [?]
##    student_id_scram  year     em     hs    mid res_zip_cde zoned_elm_dbn
##               <int> <dbl>  <chr>  <chr>  <chr>       <chr>         <chr>
##  1             2160  2007   <NA>   <NA> 20K187       11209        20K185
##  2             2160  2010   <NA> 02M475   <NA>       11209        20K185
##  3             2259  2013 21K188   <NA>   <NA>       11224        21K188
##  4             2320  2007   <NA>   <NA> 04M050       10029        04M050
##  5             2320  2010   <NA> 02M655   <NA>       10029        04M050
##  6             2339  2007   <NA>   <NA> 11X144       10469        11X078
##  7             2339  2009   <NA> 08X405   <NA>       10469        11X111
##  8             2400  2013 30Q002   <NA>   <NA>       11370        30Q002
##  9             2551  2012 02M151   <NA>   <NA>       10128        02M151
## 10             2612  2007   <NA>   <NA> 21K303       11235        21K100
## # ... with 2,247,474 more rows, and 4 more variables: zoned_mid_dbn <chr>,
## #   zoned_hs_dbn <chr>, census_block <chr>, census_tract <chr>

Match zoned and attended schools

  • 0 has em dbn, but does not match
  • 1 has em dbn and matches
  • -1 does NOT have em dbn
join_zone_bio_matches <- join_zone_bio

join_zone_bio_matches <- join_zone_bio_matches%>% 
  mutate(matchEM = ifelse(!is.na(zoned_elm_dbn), ifelse(zoned_elm_dbn == em, 1, 0), -1))

join_zone_bio_matches <- join_zone_bio_matches%>% 
  mutate(matchMID = ifelse(!is.na(zoned_mid_dbn), ifelse(zoned_mid_dbn == mid, 1, 0), -1))

join_zone_bio_matches <- join_zone_bio_matches%>% 
  mutate(matchHS = ifelse(!is.na(zoned_hs_dbn), ifelse(zoned_hs_dbn == em, 1, 0), -1))

Elementary school data

add a collumn for boro * note that some DBNs are not in district-boro-n format.

# some dbns don't have a 0 boro and the dbn is in not in standard dbn format
# 69,974 have a zoned boro of zero, 168,331 have a standard zoned boro

em_data <- join_zone_bio_matches %>%filter(!is.na(em)) %>% mutate(zoned_boro = substr(zoned_elm_dbn, 3,3))
em_data
## # A tibble: 815,633 x 15
## # Groups:   student_id_scram [815,633]
##    student_id_scram  year     em    hs   mid res_zip_cde zoned_elm_dbn
##               <int> <dbl>  <chr> <chr> <chr>       <chr>         <chr>
##  1             2259  2013 21K188  <NA>  <NA>       11224        21K188
##  2             2400  2013 30Q002  <NA>  <NA>       11370        30Q002
##  3             2551  2012 02M151  <NA>  <NA>       10128        02M151
##  4            12003  2008 75X186  <NA>  <NA>       10467        11X103
##  5            12116  2009 09X449  <NA>  <NA>       10452        09X114
##  6            12507  2007 17K167  <NA>  <NA>       11213        17K398
##  7            12772  2007 24Q071  <NA>  <NA>       11385        24Q071
##  8            22002  2009 17K191  <NA>  <NA>       11233        17K191
##  9            22016  2010 06M004  <NA>  <NA>       10032        06M004
## 10            22059  2010 18K233  <NA>  <NA>       11212        18K219
## # ... with 815,623 more rows, and 8 more variables: zoned_mid_dbn <chr>,
## #   zoned_hs_dbn <chr>, census_block <chr>, census_tract <chr>,
## #   matchEM <dbl>, matchMID <dbl>, matchHS <dbl>, zoned_boro <chr>
#%>% 

  #group_by(zoned_boro) %>% 
  #  summarise(count = n())

Number of students zoned to each elementary school

# number of students zoned to a school (includes all years of data), we don't know how many seats each school has
em_data %>% group_by(zoned_elm_dbn) %>% summarise(num_zoned_to_dbn = n())
## # A tibble: 888 x 2
##    zoned_elm_dbn num_zoned_to_dbn
##            <chr>            <int>
##  1         00678                1
##  2         00921                1
##  3         00976                1
##  4        01M015               41
##  5        01M019               56
##  6        01M020               42
##  7        01M034               36
##  8        01M063               62
##  9        01M064               91
## 10        01M110               47
## # ... with 878 more rows
# number of students who went to a specific elementary school
local_zones_data <- full_join( (em_data %>% group_by(zoned_elm_dbn) %>% summarise(zoned_to_dbn = n())), 
            (em_data %>% group_by(em) %>% summarise(went_to_dbn = n())),
           by = c("zoned_elm_dbn" = "em") )

local_zones_data <- full_join( local_zones_data, 
           (join_zone_bio_matches %>% filter(matchEM == 1) %>% group_by(zoned_elm_dbn) %>% 
   summarise(attending_zoned_dbn = n())),   by = "zoned_elm_dbn")

local_zones_data <- local_zones_data %>% mutate(perc_local = (attending_zoned_dbn/went_to_dbn)*100)
local_zones_data %>% mutate(boro = substr(zoned_elm_dbn, 3, 3)) %>% filter(!is.na(attending_zoned_dbn)) %>% group_by(boro) %>% summarize(sum_attended_zoned=sum(attending_zoned_dbn), sum_zoned= sum(zoned_to_dbn)) %>%
  mutate(perc_boro_att_zoned = (sum_attended_zoned/sum_zoned)*100)
## # A tibble: 5 x 4
##    boro sum_attended_zoned sum_zoned perc_boro_att_zoned
##   <chr>              <int>     <int>               <dbl>
## 1     K             141682    227647            62.23759
## 2     M              47835     81812            58.46942
## 3     Q             156370    206773            75.62399
## 4     R              34133     43639            78.21673
## 5     X             100669    155324            64.81226
local_zones_data
## # A tibble: 1,313 x 5
##    zoned_elm_dbn zoned_to_dbn went_to_dbn attending_zoned_dbn perc_local
##            <chr>        <int>       <int>               <int>      <dbl>
##  1         00678            1          NA                  NA         NA
##  2         00921            1          NA                  NA         NA
##  3         00976            1          NA                  NA         NA
##  4        01M015           41         309                   5   1.618123
##  5        01M019           56         468                  10   2.136752
##  6        01M020           42         951                  21   2.208202
##  7        01M034           36         323                   8   2.476780
##  8        01M063           62         299                  11   3.678930
##  9        01M064           91         412                  21   5.097087
## 10        01M110           47         614                  13   2.117264
## # ... with 1,303 more rows

{r} # # # some dbns don't have a 0 boro and the dbn is in not in standard dbn format # match_df %>% filter(match_dbn == 0) %>% mutate(zoned_boro = substr(zoned_elm_dbn, 3,3)) #%>% # group_by(zoned_boro) %>% summarise(count = n()) # # # 69974 have a zoned boro of zero, 168331 have a standard zoned boro # match_df %>% filter(match_dbn == 0) %>% mutate(zoned_boro = substr(zoned_elm_dbn, 3,3)) %>% # mutate(to_boro = substr(BioDbn, 3, 3)) %>% group_by(zoned_boro == 0) %>% summarise(count = n()) # # # count of students who went to # match_df %>% filter(match_dbn == 0) %>% mutate(zoned_boro = substr(zoned_elm_dbn, 3,3)) %>% # mutate(to_boro = substr(BioDbn, 3, 3)) # # # number of students zoned to a school # match_df %>% group_by(zoned_elm_dbn) %>% summarise(zoned_to_dbn = n()) # # # number of studebts who went to a specific school # local_zones_data <- full_join( (match_df %>% group_by(zoned_elm_dbn) %>% summarise(zoned_to_dbn = n())), # (match_df %>% group_by(BioDbn) %>% summarise(went_to_dbn = n())), # by = c("zoned_elm_dbn" = "BioDbn") ) # # local_zones_data <- full_join( local_zones_data, # (match_df %>% filter(match_dbn == 1) %>% group_by(zoned_elm_dbn) %>% # summarise(attending_zoned_dbn = n())), # by = "zoned_elm_dbn") # # local_zones_data <- local_zones_data %>% mutate(perc_local = (attending_zoned_dbn/went_to_dbn)*100) # local_zones_data %>% mutate(boro = substr(zoned_elm_dbn, 3, 3)) %>% filter(!is.na(attending_zoned_dbn)) %>% group_by(boro) %>% summarize(sum_attended_zoned=sum(attending_zoned_dbn), sum_zoned= sum(zoned_to_dbn)) %>% # mutate(perc_boro_att_zoned = (sum_attended_zoned/sum_zoned)*100) # # # local_zones_data #

#get map dbns data nyc open data
r<- GET("https://data.cityofnewyork.us/api/geospatial/cq6p-iwiy?method=export&format=GeoJSON")
dbns <- readOGR(content(r,'text'), 'OGRGeoJSON', verbose = F)
## No encoding supplied: defaulting to UTF-8.
## Warning in readOGR(content(r, "text"), "OGRGeoJSON", verbose = F): Dropping
## null geometries: 752
#dbns@polygons
#dbns@polygons[[1]]@Polygons[[1]]@coords[1,]
#dbns@polygons[[2]]@Polygons[[1]]@coords[1,]
#as.data.frame()
df_spatial_dbns <- as.data.frame(dbns@data$dbn)


# df_lat_long
# for (i in 1:2) {
#   df_lat_long[i] <- list(dbns@polygons[[i]]@Polygons[[1]]@coords[1,])
# }

#loop through polygons to get lat longs
df_lat_long <- vector(mode = "list", length = 779)
for (i in 1:779) {
  df_lat_long[[i]] <- dbns@polygons[[i]]@Polygons[[1]]@coords[1,]
}

lat_long <- do.call(rbind, df_lat_long)

dbns_latlong_bind <- cbind(df_spatial_dbns, lat_long)

colnames(dbns_latlong_bind)[1] <- "dbn"
colnames(dbns_latlong_bind)[2] <- "lat"
colnames(dbns_latlong_bind)[3] <- "long"

#elem_bio <- df %>% filter(!is.na(zoned_elm_dbn))

#map_data <- merge(, by.x = "dbn", by.y = "")

Map of school zones with DBN locations

leaflet(dbns) %>%
  addTiles() %>% 
  addPolygons(popup = ~paste("School DBN:", dbn))%>%
  addProviderTiles("CartoDB.Positron")%>%
  setView(-73.98, 40.75, zoom = 13)

# Map of percent of students in each zone who attended the school he/she was zoned to

map_data <- merge(dbns, local_zones_data, by.x = "dbn", by.y ="zoned_elm_dbn")


# how many students went to their local school
leaflet(map_data) %>%
  addTiles() %>% 
  addPolygons(popup = ~paste("Percent Local:", round(perc_local),"%"))%>%
  addProviderTiles("CartoDB.Positron")%>%
  setView(-73.98, 40.75, zoom = 13)
#addCircles(~lng, ~lat, popup=ct$type, weight = 3, radius=40, 
 #                color="#ffa500", stroke = TRUE, fillOpacity = 0.8) 
# leaflet(map_data) %>%
#   addTiles() %>% 
#   addWebGLHeatmap()%>%
#   addPolygons(popup = ~paste("Percent Local:", round(perc_local),"%"))%>%
#   addProviderTiles("CartoDB.Positron")%>%
#   setView(-73.98, 40.75, zoom = 13)
# 
# leaflet(map_data) %>% 
#   addProviderTiles(providers$CartoDB) %>%
#   setView(-73.98, 40.75, zoom = 13 ) %>%
#   addHeatmap(~ , ~, intensity = ~dbn,
#              blur = 20, max = 0.05, radius = 15)

pal <- colorNumeric(palette = "Blues",
                    domain = range(map_data@data$perc_local, na.rm=T))

leaflet(map_data) %>%
  #addTiles() %>% 
  addPolygons(fillColor = ~pal(perc_local), popup = ~perc_local, weight = 0) %>% 
  addProviderTiles("CartoDB.Positron") %>%
  setView(-73.98, 40.75, zoom = 13)